# topic 11 g

# Load the helper functions used later in this script.
# NOTE(review): each file presumably defines the function of the same
# name (assess_normality(), pop_sd()) -- confirm against those files.
source("../assess_normality.R")
source("../pop_sd.R")
#
# Here we are looking to answer questions about
# probabilities for proportions.  So a typical
# problem might be:
#
# We know the true proportion of some characteristic
# in a population is 65%.  That is, for the whole
# population, 65% of all things in the population
# have this characteristic.  Then, if we take a
# sample of size 88 from this population, what is
# the probability that the proportion of the
# sample that has that characteristic is less
# than 55%?

# As long as we have the time, let us create a
# large population (10,000 items) in which 65% of
# the items have the characteristic (coded 1) and
# 35% do not (coded 2).
big_pop <- c(rep(1, 6500), rep(2, 3500))
#
# I have had students who do not like having the
# values in big_pop be in this nice order, i.e.,
# 6500 ones and then 3500 twos.  For those students
# we can shuffle the items in big_pop.
source("../shuffle.R")
big_pop <- shuffle(big_pop)
head(big_pop, 30)
#
# Now, take 10,000 samples, each of size 88,
# from that population and record the
# proportion of each sample that has the
# characteristic in L1.
#
# L1 is preallocated as a numeric vector of zeros so that
# it never holds placeholder values that look like data.
L1 <- numeric(10000)
for (i in seq_len(10000)) {
  # get a sample of size 88 (sample() draws without
  # replacement by default)
  L2 <- sample(big_pop, 88)
  # count the items with a 1 in the sample and turn
  # that count into a proportion
  num_times <- sum(L2 == 1)
  this_proportion <- num_times / 88
  # add that to our list in L1
  L1[i] <- this_proportion
}
# Now let us look at the distribution of the
# values in L1
#
summary(L1) # compare mean and median
boxplot(L1, horizontal = TRUE)
hist(L1)
assess_normality(L1)
pop_sd(L1)
mean(L1)
# compare those to the mathematically
# predicted mean and standard deviation
# the mean should be p which is 0.65
# the standard deviation should be
# sqrt( p*(1-p)/n )
sqrt(0.65 * 0.35 / 88)
#
# So what we see is that we can use the normal
# distribution with mean = p and
# sd = sqrt(p*(1-p)/n) to answer questions
# about the probability associated with
# a known proportion.

#######################
## small diversion...why the drop in the
## histogram for the interval 0.66 to 0.68?
## (this assumes hist() chose bins of width 0.02)
## let us look at the possible outcomes; a sample of
## size 88 can contain 0, 1, ..., 88 items with the
## characteristic, so 0/88 is a possible outcome too
outcomes <- (0:88) / 88
outcomes
# then look at the number of possible
# outcomes in the 0.64 to 0.66, in 0.66 to 0.68,
# and in 0.68 to 0.70 intervals (right-closed, which
# is how hist() builds its bins by default)
outcomes[outcomes > 0.64 & outcomes <= 0.66]
outcomes[outcomes > 0.66 & outcomes <= 0.68]
outcomes[outcomes > 0.68 & outcomes <= 0.70]
# only one possible outcome (59/88) lands in the middle
# interval while its neighbours each get two; this
# explains the strange low value here
# and at a few other places.
# we would not see this if we took samples
# of size 100
#
# let us do that....
#########################
# start again from a clean vector so that no value from
# the n = 88 run can survive in L1
L1 <- numeric(10000)
for (i in seq_len(10000)) {
  # get a sample of size 100
  L2 <- sample(big_pop, 100)
  # count the items with a 1 in the sample and turn
  # that count into a proportion
  num_times <- sum(L2 == 1)
  this_proportion <- num_times / 100
  # add that to our list in L1
  L1[i] <- this_proportion
}
# Now let us look at the distribution of the
# values in L1
#
summary(L1) # compare mean and median
boxplot(L1, horizontal = TRUE)
hist(L1)
assess_normality(L1)
pop_sd(L1)
mean(L1)
# compare those to the mathematically
# predicted mean and standard deviation
# the mean should be p which is 0.65
# the standard deviation should be
# sqrt( p*(1-p)/n )
sqrt(0.65 * 0.35 / 100)

# Then go back to the original question.
#
# We know the true proportion of some characteristic
# in a population is 65%.  That is, for the whole
# population, 65% of all things in the population
# have this characteristic.  Then, if we take a
# sample of size 88 from this population, what is
# the probability that the proportion of the
# sample that has that characteristic is less
# than 55%?

# This is just the same as asking "For a
# normal distribution,
# N( 0.65, sqrt(0.65*(1-0.65)/88) ), what is
# P(X < 0.55)?"
# But we know how to do that: ask pnorm() for the
# area to the left of 0.55 under the normal curve
# with mean p and sd sqrt(p*(1-p)/n).
pnorm(
  q = 0.55,
  mean = 0.65,
  sd = sqrt(0.65 * 0.35 / 88)
)

# For a population with a characteristic that
# is known to be in 58% of the population, if
# we take a sample of size 37, what is the
# probability that the sample will show a
# proportion greater than 63%?
# "Greater than" means we want the upper tail.
pnorm(
  q = 0.63,
  mean = 0.58,
  sd = sqrt(0.58 * (1 - 0.58) / 37),
  lower.tail = FALSE
)

# If we know that the proportion of people who
# will vote for candidate A in the next election
# is 53%, then in a sample of size 734 what is
# the probability that the proportion of voters
# for candidate A in the sample will be less than
# 49% or greater than 57%?
# That is the lower tail below 0.49 plus the
# upper tail above 0.57.
pnorm(q = 0.49, mean = 0.53, sd = sqrt(0.53 * 0.47 / 734)) +
  pnorm(q = 0.57, mean = 0.53, sd = sqrt(0.53 * 0.47 / 734), lower.tail = FALSE)

############################################
############################################
# Is this normal approximation always good?
############################################
##
## Look at a new case, one where n*p < 10
##
## consider the case where
## p = 0.15, what if our sample size was 12?
## what would our experiment of 10,000
## samples look like?
# Build a new population of 10,000 items in which 15% have the
# characteristic (coded 1) and 85% do not (coded 2).
big_pop <- c(rep(1, 1500), rep(2, 8500))

# Take 10,000 samples of size 12 and record each sample proportion.
# L1 is (re)created here so this section does not depend on a vector
# left over from an earlier part of the script.
L1 <- numeric(10000)
for (i in seq_len(10000)) {
  # get a sample of size 12
  L2 <- sample(big_pop, 12)
  # count the items with a 1 in the sample and turn
  # that count into a proportion
  num_times <- sum(L2 == 1)
  this_proportion <- num_times / 12
  # add that to our list in L1
  L1[i] <- this_proportion
}
# Now let us look at the distribution of the
# values in L1
#
summary(L1) # compare mean and median
boxplot(L1, horizontal = TRUE)
hist(L1)
assess_normality(L1)
pop_sd(L1)
mean(L1)
# compare those to the mathematically
# predicted mean and standard deviation
# the mean should be p which is 0.15
# the standard deviation should be
# sqrt( p*(1-p)/n )
sqrt(0.15 * 0.85 / 12)
#
# Compare the approximation to the model
# for getting P(X < 0.05).
# First the normal approximation...
approx_prob <- pnorm(0.05, mean = 0.15, sd = sqrt(0.15 * 0.85 / 12))
approx_prob
# ...then the simulated value that sits at that same
# cumulative probability; if the approximation were good
# this would be close to 0.05...
quantile(L1, approx_prob)
# ...and the share of simulated proportions that really
# are below 0.05, to set directly against approx_prob
mean(L1 < 0.05)

# So we have a rule: if n*p >= 10 and
# if n*(1-p) >= 10 then we can use the
# normal approximation for the probabilities.
# (Here n*p = 12*0.15 = 1.8, so the rule is not met.)

######################################
## We seem to do the same thing each time we
## run into this problem, that is, we use the
## population proportion as the mean and the
## expression sqrt( p*(1-p)/n ) as the standard
## deviation.  Could we put this into a
## function?  Yes, look at pprop().
## NOTE(review): pprop() is defined in ../pprop.R, which is not
## shown here; the calls below assume its arguments are
## (value, p, n, lower.tail) -- confirm against that file.
source("../pprop.R")
#
# remember that we did
pnorm(0.55, mean = 0.65, sd = sqrt(0.65 * 0.35 / 88))
## now try
pprop(0.55, 0.65, 88)
# or we did
pnorm(0.63, mean = 0.58, sd = sqrt(0.58 * (1 - 0.58) / 37),
      lower.tail = FALSE)
# now try
pprop(0.63, 0.58, 37, lower.tail = FALSE)